rm(list=ls())
library(tidyverse)

# NOTE: This script requires data files not included in the replication deposit:
#   - FullContactData.csv (contains PII from sampling frame)
#   - CleanedDeIdentifiedSPEPS02022021.csv
#   - ID_CrosswalkSPEPS02022021.csv
# Contact the authors for access to these files.

# Set working directory to the repository root
# Update these paths to match your local file locations
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
contact_data <- read_csv("FullContactData.csv")

dat <- read_csv("CleanedDeIdentifiedSPEPS02022021.csv")

dat$weight_use


contact_data <- contact_data %>% mutate(num_careers = str_count(Careers, ";"))
contact_data$num_careers <- contact_data$num_careers +1
contact_data$num_careers[is.na(contact_data$num_careers)] <- 0
summary(contact_data$num_careers)

library(lubridate)
contact_data$Start.Year <- str_sub(contact_data$Start.Date, -4,-1)

summary(as.numeric(contact_data$Start.Year))

table(as.factor(contact_data$Political.Affiliation), as.factor(contact_data$Group))

contact_data$inferred.Party <- "Unk"
contact_data$inferred.Party[contact_data$Political.Affiliation == "Democrat"] <- "D"
contact_data$inferred.Party[contact_data$Political.Affiliation == "Democrat Independent"] <- "D"
contact_data$inferred.Party[contact_data$Political.Affiliation == "Democrat Working Families"] <- "D"
contact_data$inferred.Party[contact_data$Political.Affiliation == "Democratic Farmer Labor"] <- "D"
contact_data$inferred.Party[contact_data$Political.Affiliation == "Republican"] <- "R"


contact_data$num_career_quintile <- ntile(contact_data$num_careers, 5)

contact_data$career_years <- 2020 - as.numeric(contact_data$Start.Year)

ntile_na <- function(x,n)
{
  notna <- !is.na(x)
  out <- rep(NA_real_,length(x))
  out[notna] <- ntile(x[notna],n)
  return(out)
}
contact_data$career_years_quintile <- ntile_na(contact_data$career_years, 5)

contact_data$career_years_quintile[is.na(contact_data$career_years)] <- "Unk"

summary(as.factor(contact_data$career_years_quintile))

#contact_data <- contact_data %>% filter(Country == "USA")

ids <- read_csv("ID_CrosswalkSPEPS02022021.csv")

respondents <- readRDS(file = "Data/EliteGWData.rds")
respondents

dat <- dat %>% left_join(ids)

dat_id_weights <- dat %>% filter(ResponseId %in% respondents$ResponseId) %>%
  dplyr::select(RecipientEmail, weight_use) %>% group_by(RecipientEmail) %>% summarise(weight = sum(weight_use))

dat_id_weights

contact_data <- contact_data %>% left_join(dat_id_weights, by = c("Email" = "RecipientEmail"))

contact_data$Respondent <- 0

contact_data$Respondent[!is.na(contact_data$weight)] <- 1

contact_data$weight[is.na(contact_data$weight)] <- 1

View(contact_data)


# install.packages(c("survey", "dplyr", "tibble", "purrr"))
library(survey)
library(dplyr)
library(tibble)
library(purrr)

# 1) Survey design object (no clustering/strata info provided -> ids = ~1)
des <- svydesign(
  ids = ~1,
  weights = ~weight,
  data = contact_data
)

# Ensure group is treated as a factor (useful for tabulations / model matrices)
des <- update(des, Respondent = factor(Respondent, levels = c(0, 1)))

cat_vars <- c("State", "Gender", "Group")
num_vars <- c("num_careers", "career_years")

# -----------------------------
# CATEGORICAL BALANCE TESTS
# Rao–Scott chi-square via svychisq
# -----------------------------
cat_tests <- map_dfr(cat_vars, function(v) {
  fml <- as.formula(paste0("~", v, "+Respondent"))
  tst <- svychisq(fml, design = des, statistic = "F")  # "F" gives design-based F test
  
  tibble(
    variable = v,
    test = "Rao-Scott chi-square (design-based F)",
    statistic = unname(tst$statistic),
    df_num = unname(tst$parameter[1]),
    df_den = unname(tst$parameter[2]),
    p_value = unname(tst$p.value)
  )
})

# -----------------------------
# NUMERIC BALANCE TESTS
# Difference in weighted means between Respondent=1 vs 0
# Implemented as svyglm(y ~ Respondent) and extract Respondent coefficient
# -----------------------------
num_tests <- map_dfr(num_vars, function(v) {
  fml <- as.formula(paste0(v, " ~ Respondent"))
  fit <- svyglm(fml, design = des)
  
  co <- summary(fit)$coefficients
  # Row name is "Respondent1" because Respondent is a factor with baseline 0
  rn <- grep("^Respondent", rownames(co), value = TRUE)[1]
  
  tibble(
    variable = v,
    test = "Difference in weighted means (Respondent=1 minus 0)",
    estimate = unname(co[rn, "Estimate"]),
    se = unname(co[rn, "Std. Error"]),
    t = unname(co[rn, "t value"]),
    p_value = unname(co[rn, "Pr(>|t|)"])
  )
})

# Combine
balance_results <- bind_rows(
  cat_tests %>% select(variable, test, statistic, df_num, df_den, p_value),
  num_tests %>% select(variable, test, estimate, se, t, p_value)
)

num_vars <- c("num_careers", "Start.Year", "career_years")

weighted_smd_numeric <- function(var, des) {
  
  # weighted means by group
  m <- svyby(
    as.formula(paste0("~", var)),
    ~Respondent,
    des,
    svymean,
    na.rm = TRUE
  )
  
  # weighted variances by group
  v <- svyby(
    as.formula(paste0("~", var)),
    ~Respondent,
    des,
    svyvar,
    na.rm = TRUE
  )
  
  m0 <- m[[var]][m$Respondent == 0]
  m1 <- m[[var]][m$Respondent == 1]
  
  v0 <- v[[var]][v$Respondent == 0]
  v1 <- v[[var]][v$Respondent == 1]
  
  smd <- (m1 - m0) / sqrt((v1 + v0) / 2)
  
  tibble(
    variable = var,
    mean_respondent_0 = m0,
    mean_respondent_1 = m1,
    smd = smd
  )
}

num_smds <- map_dfr(num_vars, weighted_smd_numeric, des = des)

num_smds

cat_vars <- c("State", "Gender", "Group")

weighted_smd_categorical <- function(var, des) {
  
  # Ensure variable is treated as a factor inside the design
  des2 <- update(des, tmp_cat = factor(get(var)))
  
  # Weighted two-way table: Respondent x level(tmp_cat)
  tab <- svytable(~ Respondent + tmp_cat, design = des2)
  
  # Convert to row proportions: p(level | Respondent)
  props <- prop.table(tab, margin = 1)
  
  # Ensure both groups exist
  rn <- rownames(props)
  if (!all(c("0", "1") %in% rn)) {
    return(tibble(variable = var, smd_max = NA_real_))
  }
  
  p0 <- as.numeric(props["0", ])
  p1 <- as.numeric(props["1", ])
  
  # Level-wise standardized differences in proportions
  smds <- (p1 - p0) / sqrt((p1 * (1 - p1) + p0 * (1 - p0)) / 2)
  
  tibble(
    variable = var,
    smd_max = max(abs(smds), na.rm = TRUE)
  )
}

cat_vars <- c("State", "Gender", "Group")

cat_smds <- map_dfr(cat_vars, weighted_smd_categorical, des = des)
cat_smds

numeric_balance_table <- num_tests %>%
  left_join(num_smds, by = "variable") %>%
  rename(
    mean_diff = estimate,
    std_error = se
  ) %>%
  select(
    variable,
    mean_respondent_0,
    mean_respondent_1,
    mean_diff,
    std_error,
    smd,
    p_value
  ) %>%
  mutate(
    across(c(mean_respondent_0, mean_respondent_1,
             mean_diff, std_error, smd),
           ~ round(., 3)),
    p_value = signif(p_value, 3)
  )

numeric_balance_table

categorical_balance_table <- cat_tests %>%
  select(variable, statistic, df_num, df_den, p_value) %>%
  left_join(
    cat_smds %>% rename(smd_max = smd_max),
    by = "variable"
  ) %>%
  mutate(
    statistic = round(statistic, 2),
    df_num = round(df_num, 2),
    df_den = round(df_den, 0),
    smd_max = round(smd_max, 3),
    p_value = signif(p_value, 3),
    interpretation = case_when(
      smd_max < 0.10 ~ "Negligible",
      smd_max < 0.25 ~ "Modest",
      TRUE ~ "Moderate"
    )
  ) %>%
  select(
    variable,
    statistic,
    df_num,
    df_den,
    p_value,
    smd_max,
    interpretation
  )

categorical_balance_table



weighted_smd_categorical_full <- function(var, des) {
  
  # Create a temporary factor inside the survey design
  des2 <- update(des, tmp_cat = factor(get(var)))
  
  # Weighted two-way table: Respondent x category
  tab <- survey::svytable(~ Respondent + tmp_cat, design = des2)
  
  # Convert to row proportions (within Respondent)
  props <- prop.table(tab, margin = 1)
  
  # Ensure both groups exist
  if (!all(c("0", "1") %in% rownames(props))) {
    return(tibble::tibble(
      variable = var,
      level = NA_character_,
      p0 = NA_real_,
      p1 = NA_real_,
      smd = NA_real_
    ))
  }
  
  # Extract weighted proportions
  p0 <- as.numeric(props["0", ])
  p1 <- as.numeric(props["1", ])
  
  # Level-specific standardized differences
  smds <- (p1 - p0) /
    sqrt((p1 * (1 - p1) + p0 * (1 - p0)) / 2)
  
  tibble::tibble(
    variable = var,
    level = colnames(props),
    p0 = p0,
    p1 = p1,
    smd = smds
  )
}

state_level_balance <- weighted_smd_categorical_full("State", des) %>%
  dplyr::mutate(abs_smd = abs(smd)) %>%
  dplyr::arrange(desc(abs_smd))

state_level_balance <- state_level_balance %>% dplyr::slice(1:10)

# Group-level imbalance
group_level_balance <- weighted_smd_categorical_full("Group", des) %>%
  dplyr::mutate(abs_smd = abs(smd)) %>%
  dplyr::arrange(desc(abs_smd))

group_level_balance 


library(dplyr)
library(tidyr)


respondents_dec2 <- dat %>%
  mutate(
    # Relevel existing factors (or coerce then relevel)
    hisp = fct_relevel(factor(hisp), "No", "Yes"),
    
    age = fct_relevel(
      factor(age),
      "21 - 25", "26 - 30", "31 - 35", "36 - 40", "41 - 45",
      "46 - 50", "51 - 55", "56 - 60", "> 60"
    ),
    
    party.relevance_1 = fct_relevel(
      factor(party.relevance_1),
      "Strongly disagree", "Somewhat disagree", "Neither agree nor disagree",
      "Somewhat agree", "Strongly agree"
    ),
    party.relevance_2 = fct_relevel(
      factor(party.relevance_2),
      "Strongly disagree", "Somewhat disagree", "Neither agree nor disagree",
      "Somewhat agree", "Strongly agree"
    ),
    party.relevance_3 = fct_relevel(
      factor(party.relevance_3),
      "Strongly disagree", "Somewhat disagree", "Neither agree nor disagree",
      "Somewhat agree", "Strongly agree"
    ),
    
    # Collapsed race
    race2 = case_when(
      is.na(race) ~ NA_character_,
      race == "White" ~ "White",
      TRUE ~ "POC"
    ) %>% factor(levels = c("White", "POC")),
    
    # Collapsed gender
    gender2 = case_when(
      is.na(gender) ~ NA_character_,
      gender == "Prefer not to say" ~ NA_character_,
      gender == "Man" ~ "Man",
      gender == "Woman" ~ "Woman",
      TRUE ~ "Trans/Non-Binary/Self Describe"
    ) %>% factor(levels = c("Man", "Woman", "Trans/Non-Binary/Self Describe")),
    
    # Collapsed LGBT
    lgbt2 = case_when(
      is.na(lgbt) ~ NA_character_,
      lgbt == "Prefer not to say" ~ "Prefer not to say",
      lgbt == "Yes" ~ "Yes",
      TRUE ~ "No"
    ) %>% factor(levels = c("No", "Yes", "Prefer not to say"))
  ) %>%
  select(
    # weights / identifiers
    weight_use,
    
    # demographics
    hisp,
    race2,
    gender2,
    lgbt2,
    age,
    
    # political relevance items
    party.relevance_1,
    party.relevance_2,
    party.relevance_3
  )



respondents_dec_svy <- svydesign(
  ids = ~1,
  weights = ~weight_use,
  data = respondents_dec2
) 

ft_sample_demo <- respondents_dec_svy %>%
  tbl_svysummary(
    statistic = list(all_categorical() ~ "{n} ({p}%)"),
    percent = "row",
    missing = "no"
  ) %>%
  as_flex_table() %>%
  autofit() %>%
  fontsize(size = 10, part = "all") %>%
  theme_vanilla()

top_titles_within_group <- contact_data %>%
  filter(Respondent == 1) %>%
  filter(!is.na(Group), !is.na(Title)) %>%
  group_by(Group, Title) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    percent = 100 * n / sum(n)
  ) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  mutate(percent = round(percent, 1))

top_titles_within_group
top_titles_within_group_fmt <- top_titles_within_group %>%
  mutate(
    value = sprintf("%d (%.1f%%)", n, percent)
  ) %>%
  select(Group, Title, value)

top_titles_within_group_fmt

top_orgs_within_group <- contact_data %>%
  filter(Respondent == 1) %>%
  filter(!is.na(Group), !is.na(Organization.Name..Parent.)) %>%
  group_by(Group, Organization.Name..Parent.) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    percent = 100 * n / sum(n)
  ) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  mutate(percent = round(percent, 1))

top_orgs_within_group

top_org_within_group_fmt <- top_orgs_within_group %>%
  mutate(
    value = sprintf("%d (%.1f%%)", n, percent)
  ) %>%
  select(Group, Organization.Name..Parent., value)

top_org_within_group_fmt


library(flextable)
library(officer)
library(purrr)

tables <- list(
  "Table 1. Numeric balance" = numeric_balance_table,
  "Table 2. Categorical balance" = categorical_balance_table,
  "Table 3. Top State imbalance contributors" = state_level_balance,
  "Table 4. Group imbalance" = group_level_balance,
  "Table 5. Sample Demographics (weighted)" = ft_sample_demo,
  "Table 6. Top titles within group" = top_titles_within_group_fmt,
  "Table 6. Top orgs within group" = top_org_within_group_fmt
)

library(dplyr)
library(scales)

format_p <- function(p, digits = 3, threshold = 1e-3) {
  ifelse(
    is.na(p), NA_character_,
    ifelse(p < threshold, "<0.001",
           format(round(p, digits), nsmall = digits, trim = TRUE))
  )
}

library(dplyr)
library(scales)

format_table <- function(df,
                         digits_p = 3,
                         p_thresh = 1e-3,
                         digits_smd = 3,
                         digits_num = 2,
                         digits_percent = 1) {
  
  # local p-value formatter (so you can't "forget" to define it)
  fmt_p <- function(p) {
    ifelse(
      is.na(p), NA_character_,
      ifelse(p < p_thresh,
             paste0("<", format(p_thresh, scientific = FALSE, trim = TRUE)),
             format(round(p, digits_p), nsmall = digits_p, trim = TRUE))
    )
  }
  
  df %>%
    mutate(
      # p-values
      across(any_of(c("p_value", "p.value")), ~ fmt_p(as.numeric(.))),
      
      # SMDs
      across(any_of(c("smd", "smd_max")), ~ round(as.numeric(.), digits_smd)),
      
      # common numeric columns
      across(any_of(c("statistic", "estimate", "se", "t")), ~ round(as.numeric(.), digits_num)),
      
      # degrees of freedom / counts: comma-separated integers
      across(any_of(c("df_den", "df_num", "n")), ~ comma(round(as.numeric(.), 0))),
      
      # percents
      across(any_of(c("percent")), ~ round(as.numeric(.), digits_percent))
    )
}


tables_fmt <- lapply(tables, function(x) {
  if (inherits(x, "flextable")) {
    x
  } else if (is.data.frame(x)) {
    format_table(x)
  } else {
    x
  }
})

doc <- read_docx()

for (nm in names(tables_fmt)) {
  
  obj <- tables_fmt[[nm]]
  
  # If it's already a flextable, keep it; otherwise convert df -> flextable
  ft <- if (inherits(obj, "flextable")) {
    obj
  } else {
    flextable(obj) |>
      autofit() |>
      fontsize(size = 10, part = "all") |>
      theme_vanilla()
  }
  
  doc <- doc |>
    body_add_par(nm, style = "heading 2") |>
    body_add_flextable(ft) |>
    body_add_par("", style = "Normal")
}

print(doc, target = "tables_for_word.docx")
